Call Libraries
library(tidyverse)
library(caret)
library(MASS)
library(car)
library(moments)
Calling the Transformed Datasets
income_cleaned = read_csv('NYS_Corp_Tax_Credit_data/income_cleaned.csv')
Rows: 1921 Columns: 6── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): Name, Group
dbl (4): Year, Num, Amount, Avg
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
income_cleaned
industry_cleaned = read_csv('NYS_Corp_Tax_Credit_data/industry_cleaned.csv')
Rows: 2476 Columns: 6── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): Name, Group
dbl (4): Year, Num, Amount, Avg
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
industry_cleaned
Creating the Models
sat.model.summary <- function (df, field, sat.formula){
#Shapiro-Wilks test to evaluate normality
print(shapiro.test(df[[field]]))
#Kurtosis evaluation (normal distribution has a value close to 3)
print('kurtosis')
print(kurtosis(df[[field]]))
linear.model.cleaned = lm(sat.formula, data = df)
print(summary(linear.model.cleaned))
plot(linear.model.cleaned)
#histograms of response variable to check distribution
print(df %>%
ggplot(aes_string(field)) +
geom_histogram() +
labs(title = 'Average Credit Amount Distribution') +
theme(plot.title = element_text(hjust = 0.5)))
#Checking multicollinearity using VIF measurement
print(vif(linear.model.cleaned))
influencePlot(linear.model.cleaned)
#avPlots(linear.model.cleaned)
}
sat.formula <- Avg ~ . - Amount
sat.field <- 'Avg'
sat.model.summary(income_cleaned, sat.field, sat.formula)
Shapiro-Wilk normality test
data: df[[field]]
W = 0.17297, p-value < 2.2e-16
[1] "kurtosis"
[1] 169.7518
Call:
lm(formula = sat.formula, data = df)
Residuals:
Min 1Q Median 3Q Max
-11338570 -547702 94139 421302 87181761
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.157e+08 5.512e+07 -2.100 0.03590
Year 5.721e+04 2.731e+04 2.095 0.03634
NameAlternative Fuels and Electric Vehicle Recharging Property Credit -3.287e+05 1.242e+06 -0.265 0.79140
NameAlternative Minimum Tax Credit 6.538e+05 9.757e+05 0.670 0.50286
NameBeer Production Credit 3.316e+05 1.290e+06 0.257 0.79715
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 6/23/08 but before 7/1/15 1.429e+06 9.960e+05 1.434 0.15162
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 7/1/15 4.098e+06 1.421e+06 2.883 0.00398
NameBrownfield Tax Credits - Redevelopment Tax Credit - Prior to 6/23/08 1.182e+06 9.927e+05 1.191 0.23386
NameBrownfield Tax Credits - Remediation Real Property Tax Credit -7.587e+04 9.920e+05 -0.076 0.93904
NameClean Heating Fuel Credit 7.148e+04 1.024e+06 0.070 0.94436
NameConservation Easement Tax Credit 1.143e+05 1.064e+06 0.107 0.91444
NameCredit for Employment of Persons with Disabilities -9.343e+05 1.119e+06 -0.835 0.40391
NameCredit for Purchase of an Automated External Defibrillator -1.449e+05 9.695e+05 -0.150 0.88117
NameCredit for Taxicabs & Livery Service Vehicles Accessible to Persons with Disabilities 2.316e+05 1.779e+06 0.130 0.89645
NameEmpire State Apprentice Tax Credit -7.983e+05 1.524e+06 -0.524 0.60040
NameEmpire State Commercial Production Credit 2.183e+05 1.291e+06 0.169 0.86576
NameEmpire State Film Post Production Credit 2.521e+04 1.049e+06 0.024 0.98082
NameEmpire State Film Production Credit 1.145e+07 9.967e+05 11.485 < 2e-16
NameEmpire State Musical and Theatrical Production Credit -2.342e+05 1.775e+06 -0.132 0.89507
NameExcelsior Jobs Program Credit 1.035e+05 9.545e+05 0.108 0.91366
NameEZ/QEZE Tax Credits - EZ Investment Tax Credit 2.752e+06 8.862e+05 3.105 0.00193
NameEZ/QEZE Tax Credits - EZ Wage Tax Credit 4.845e+05 9.213e+05 0.526 0.59899
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes 1.245e+06 9.239e+05 1.348 0.17785
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes For Corporate Partners -1.025e+05 9.549e+05 -0.107 0.91455
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit -6.145e+04 9.387e+05 -0.065 0.94781
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit For Corporate Partners 9.119e+04 1.044e+06 0.087 0.93038
NameFarm Workforce Retention Credit -7.802e+03 1.285e+06 -0.006 0.99516
NameFarmers' School Tax Credit 1.440e+05 1.021e+06 0.141 0.88786
NameHire a Veteran Credit -1.238e+06 1.782e+06 -0.695 0.48721
NameHistoric Properties Rehabilitation Credit 1.352e+06 1.055e+06 1.282 0.20015
NameIndustrial or Manufacturing Business Tax Credit 4.997e+05 1.074e+06 0.465 0.64173
NameInvestment Tax Credit 4.463e+05 8.895e+05 0.502 0.61588
NameInvestment Tax Credit for the Financial Services Industry 3.976e+05 1.008e+06 0.395 0.69325
NameLife Sciences Research & Development Tax Credit -8.207e+04 2.099e+06 -0.039 0.96882
NameLong-Term Care Insurance Credit 1.246e+05 9.808e+05 0.127 0.89892
NameLow-Income Housing Credit -2.764e+04 1.113e+06 -0.025 0.98020
NameMinimum Wage Reimbursement Credit -2.931e+05 1.006e+06 -0.291 0.77075
NameMortgage Servicing Tax Credit -4.540e+05 1.085e+06 -0.418 0.67565
NameNew York Youth Jobs Program Tax Credit -2.587e+05 9.381e+05 -0.276 0.78279
NameQETC Capital Tax Credit 2.616e+05 1.386e+06 0.189 0.85032
NameQETC Employment Credit 1.335e+05 1.026e+06 0.130 0.89641
NameQETC Facilities, Operations, and Training Credit 5.153e+05 1.918e+06 0.269 0.78820
NameReal Property Tax Relief Credit for Manufacturing -2.635e+05 9.654e+05 -0.273 0.78490
NameSpecial Additional Mortgage Recording Tax Credit 3.678e+04 9.244e+05 0.040 0.96827
NameSTART-UP NY Tax Elimination Credit 4.061e+03 1.130e+06 0.004 0.99713
Group1,000,000 - 24,999,999 2.170e+05 3.501e+05 0.620 0.53548
Group100,000 - 499,999 1.352e+05 3.579e+05 0.378 0.70572
Group100,000,000 - 499,999,999 9.644e+05 3.937e+05 2.449 0.01441
Group25,000,000 - 49,999,999 4.519e+05 4.223e+05 1.070 0.28474
Group50,000,000 - 99,999,999 4.021e+05 4.235e+05 0.950 0.34247
Group500,000 - 999,999 2.195e+05 3.880e+05 0.566 0.57170
Group500,000,000 - and over 3.073e+06 3.846e+05 7.991 2.33e-15
GroupZero or Net Loss 9.536e+05 3.339e+05 2.856 0.00433
Num -4.520e+02 8.569e+02 -0.527 0.59791
(Intercept) *
Year *
NameAlternative Fuels and Electric Vehicle Recharging Property Credit
NameAlternative Minimum Tax Credit
NameBeer Production Credit
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 6/23/08 but before 7/1/15
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 7/1/15 **
NameBrownfield Tax Credits - Redevelopment Tax Credit - Prior to 6/23/08
NameBrownfield Tax Credits - Remediation Real Property Tax Credit
NameClean Heating Fuel Credit
NameConservation Easement Tax Credit
NameCredit for Employment of Persons with Disabilities
NameCredit for Purchase of an Automated External Defibrillator
NameCredit for Taxicabs & Livery Service Vehicles Accessible to Persons with Disabilities
NameEmpire State Apprentice Tax Credit
NameEmpire State Commercial Production Credit
NameEmpire State Film Post Production Credit
NameEmpire State Film Production Credit ***
NameEmpire State Musical and Theatrical Production Credit
NameExcelsior Jobs Program Credit
NameEZ/QEZE Tax Credits - EZ Investment Tax Credit **
NameEZ/QEZE Tax Credits - EZ Wage Tax Credit
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes For Corporate Partners
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit For Corporate Partners
NameFarm Workforce Retention Credit
NameFarmers' School Tax Credit
NameHire a Veteran Credit
NameHistoric Properties Rehabilitation Credit
NameIndustrial or Manufacturing Business Tax Credit
NameInvestment Tax Credit
NameInvestment Tax Credit for the Financial Services Industry
NameLife Sciences Research & Development Tax Credit
NameLong-Term Care Insurance Credit
NameLow-Income Housing Credit
NameMinimum Wage Reimbursement Credit
NameMortgage Servicing Tax Credit
NameNew York Youth Jobs Program Tax Credit
NameQETC Capital Tax Credit
NameQETC Employment Credit
NameQETC Facilities, Operations, and Training Credit
NameReal Property Tax Relief Credit for Manufacturing
NameSpecial Additional Mortgage Recording Tax Credit
NameSTART-UP NY Tax Elimination Credit
Group1,000,000 - 24,999,999
Group100,000 - 499,999
Group100,000,000 - 499,999,999 *
Group25,000,000 - 49,999,999
Group50,000,000 - 99,999,999
Group500,000 - 999,999
Group500,000,000 - and over ***
GroupZero or Net Loss **
Num
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3868000 on 1867 degrees of freedom
Multiple R-squared: 0.2361, Adjusted R-squared: 0.2144
F-statistic: 10.88 on 53 and 1867 DF, p-value: < 2.2e-16
GVIF Df GVIF^(1/(2*Df))
Year 2.125869 1 1.458036
Name 3.521453 43 1.014746
Group 1.510763 8 1.026124
Num 1.645484 1 1.282764
income.model <- lm(sat.formula, data = income_cleaned)
sat.model.summary(industry_cleaned, sat.field, sat.formula)
Shapiro-Wilk normality test
data: df[[field]]
W = 0.22287, p-value < 2.2e-16
[1] "kurtosis"
[1] 135.5482
Call:
lm(formula = sat.formula, data = df)
Residuals:
Min 1Q Median 3Q Max
-11581741 -371031 -23164 154182 28917959
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.926e+07 2.051e+07 -1.914 0.055677
Year 1.936e+04 1.016e+04 1.905 0.056959
NameAlternative Fuels and Electric Vehicle Recharging Property Credit 5.584e+04 5.062e+05 0.110 0.912180
NameAlternative Minimum Tax Credit 3.523e+05 4.208e+05 0.837 0.402556
NameBeer Production Credit 8.799e+04 6.261e+05 0.141 0.888249
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 6/23/08 but before 7/1/15 1.919e+06 4.473e+05 4.290 1.86e-05
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 7/1/15 4.064e+06 7.278e+05 5.584 2.62e-08
NameBrownfield Tax Credits - Redevelopment Tax Credit - Prior to 6/23/08 1.054e+06 4.779e+05 2.206 0.027462
NameBrownfield Tax Credits - Remediation Real Property Tax Credit 1.327e+05 4.760e+05 0.279 0.780429
NameClean Heating Fuel Credit 2.785e+05 4.595e+05 0.606 0.544549
NameConservation Easement Tax Credit 2.743e+04 4.700e+05 0.058 0.953457
NameCredit for Employment of Persons with Disabilities 1.574e+05 5.069e+05 0.311 0.756142
NameCredit for Purchase of an Automated External Defibrillator 1.082e+05 4.465e+05 0.242 0.808639
NameCredit for Taxicabs & Livery Service Vehicles Accessible to Persons with Disabilities 2.090e+05 8.286e+05 0.252 0.800866
NameEmpire State Apprentice Tax Credit -3.324e+05 1.012e+06 -0.329 0.742537
NameEmpire State Commercial Production Credit 3.524e+05 6.172e+05 0.571 0.568123
NameEmpire State Film Post Production Credit 5.332e+05 5.224e+05 1.021 0.307585
NameEmpire State Film Production Credit 1.170e+07 4.785e+05 24.458 < 2e-16
NameEmpire State Musical and Theatrical Production Credit 7.676e+04 9.010e+05 0.085 0.932112
NameExcelsior Jobs Program Credit 7.086e+05 4.393e+05 1.613 0.106849
NameEZ/QEZE Tax Credits - EZ Investment Tax Credit 1.381e+06 4.318e+05 3.199 0.001399
NameEZ/QEZE Tax Credits - EZ Wage Tax Credit 3.715e+05 4.213e+05 0.882 0.377942
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes 1.160e+06 4.192e+05 2.767 0.005700
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes For Corporate Partners 3.634e+05 4.311e+05 0.843 0.399359
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit 2.567e+05 4.265e+05 0.602 0.547350
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit For Corporate Partners 7.379e+04 5.041e+05 0.146 0.883634
NameFarm Workforce Retention Credit 2.727e+04 5.352e+05 0.051 0.959361
NameFarmers' School Tax Credit 1.287e+05 5.133e+05 0.251 0.802102
NameHire a Veteran Credit 1.459e+05 8.253e+05 0.177 0.859709
NameHistoric Properties Rehabilitation Credit 1.875e+06 4.841e+05 3.874 0.000110
NameInvestment Tax Credit 7.182e+05 4.129e+05 1.739 0.082075
NameInvestment Tax Credit for the Financial Services Industry 6.339e+05 5.757e+05 1.101 0.270913
NameLife Sciences Research & Development Tax Credit -2.243e+03 9.007e+05 -0.002 0.998014
NameLong-Term Care Insurance Credit 1.385e+05 4.200e+05 0.330 0.741537
NameLow-Income Housing Credit 1.642e+06 5.494e+05 2.988 0.002833
NameMinimum Wage Reimbursement Credit 9.624e+04 4.381e+05 0.220 0.826159
NameMortgage Servicing Tax Credit 2.077e+05 6.462e+05 0.321 0.747934
NameNew York Youth Jobs Program Tax Credit 1.953e+05 4.273e+05 0.457 0.647604
NameQETC Capital Tax Credit 2.986e+05 5.868e+05 0.509 0.610823
NameQETC Employment Credit 8.982e+04 4.465e+05 0.201 0.840607
NameQETC Facilities, Operations, and Training Credit 3.477e+05 6.711e+05 0.518 0.604441
NameReal Property Tax Relief Credit for Manufacturing 1.520e+05 4.423e+05 0.344 0.731171
NameSpecial Additional Mortgage Recording Tax Credit 2.609e+05 4.432e+05 0.589 0.556155
NameSTART-UP NY Tax Elimination Credit 1.703e+04 4.750e+05 0.036 0.971411
GroupAdministrative and Support and Waste Management and Remediation Services -2.280e+03 2.519e+05 -0.009 0.992777
GroupAdministrative/Support/Waste Management/Remediation Services -3.173e+03 2.798e+05 -0.011 0.990952
GroupAgriculture, Forestry, Fishing and Hunting 7.269e+04 2.330e+05 0.312 0.755097
GroupArts, Entertainment, and Recreation 5.866e+05 2.317e+05 2.532 0.011408
GroupConstruction -1.699e+04 2.171e+05 -0.078 0.937621
GroupEducational Services 5.569e+04 2.948e+05 0.189 0.850176
GroupFinance and Insurance 2.139e+05 2.034e+05 1.051 0.293237
GroupHealth Care and Social Assistance 1.300e+04 2.290e+05 0.057 0.954731
GroupInformation -2.433e+05 2.178e+05 -1.117 0.263917
GroupManagement of Companies and Enterprises 3.355e+05 1.949e+05 1.721 0.085332
GroupManufacturing 6.786e+05 2.028e+05 3.346 0.000831
GroupMining -1.561e+04 3.593e+05 -0.043 0.965352
GroupMining, Quarrying, and Oil and Gas Extraction 1.084e+05 3.011e+05 0.360 0.718901
GroupOther Services (except Public Administration) -6.462e+04 2.217e+05 -0.291 0.770697
GroupProfessional, Scientific, and Technical Services 3.736e+05 2.085e+05 1.792 0.073214
GroupReal Estate and Rental and Leasing 9.506e+04 2.044e+05 0.465 0.641882
GroupRetail Trade 4.746e+04 2.036e+05 0.233 0.815756
GroupTransportation and Warehousing -1.866e+04 2.300e+05 -0.081 0.935321
GroupUtilities 5.308e+05 2.633e+05 2.016 0.043917
GroupWholesale Trade 6.106e+04 2.081e+05 0.293 0.769261
Num -9.657e+02 4.272e+02 -2.260 0.023889
(Intercept) .
Year .
NameAlternative Fuels and Electric Vehicle Recharging Property Credit
NameAlternative Minimum Tax Credit
NameBeer Production Credit
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 6/23/08 but before 7/1/15 ***
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 7/1/15 ***
NameBrownfield Tax Credits - Redevelopment Tax Credit - Prior to 6/23/08 *
NameBrownfield Tax Credits - Remediation Real Property Tax Credit
NameClean Heating Fuel Credit
NameConservation Easement Tax Credit
NameCredit for Employment of Persons with Disabilities
NameCredit for Purchase of an Automated External Defibrillator
NameCredit for Taxicabs & Livery Service Vehicles Accessible to Persons with Disabilities
NameEmpire State Apprentice Tax Credit
NameEmpire State Commercial Production Credit
NameEmpire State Film Post Production Credit
NameEmpire State Film Production Credit ***
NameEmpire State Musical and Theatrical Production Credit
NameExcelsior Jobs Program Credit
NameEZ/QEZE Tax Credits - EZ Investment Tax Credit **
NameEZ/QEZE Tax Credits - EZ Wage Tax Credit
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes **
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes For Corporate Partners
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit For Corporate Partners
NameFarm Workforce Retention Credit
NameFarmers' School Tax Credit
NameHire a Veteran Credit
NameHistoric Properties Rehabilitation Credit ***
NameInvestment Tax Credit .
NameInvestment Tax Credit for the Financial Services Industry
NameLife Sciences Research & Development Tax Credit
NameLong-Term Care Insurance Credit
NameLow-Income Housing Credit **
NameMinimum Wage Reimbursement Credit
NameMortgage Servicing Tax Credit
NameNew York Youth Jobs Program Tax Credit
NameQETC Capital Tax Credit
NameQETC Employment Credit
NameQETC Facilities, Operations, and Training Credit
NameReal Property Tax Relief Credit for Manufacturing
NameSpecial Additional Mortgage Recording Tax Credit
NameSTART-UP NY Tax Elimination Credit
GroupAdministrative and Support and Waste Management and Remediation Services
GroupAdministrative/Support/Waste Management/Remediation Services
GroupAgriculture, Forestry, Fishing and Hunting
GroupArts, Entertainment, and Recreation *
GroupConstruction
GroupEducational Services
GroupFinance and Insurance
GroupHealth Care and Social Assistance
GroupInformation
GroupManagement of Companies and Enterprises .
GroupManufacturing ***
GroupMining
GroupMining, Quarrying, and Oil and Gas Extraction
GroupOther Services (except Public Administration)
GroupProfessional, Scientific, and Technical Services .
GroupReal Estate and Rental and Leasing
GroupRetail Trade
GroupTransportation and Warehousing
GroupUtilities *
GroupWholesale Trade
Num *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1612000 on 2411 degrees of freedom
Multiple R-squared: 0.4674, Adjusted R-squared: 0.4533
F-statistic: 33.06 on 64 and 2411 DF, p-value: < 2.2e-16
GVIF Df GVIF^(1/(2*Df))
Year 2.190806 1 1.480137
Name 5.185764 42 1.019787
Group 2.724217 20 1.025371
Num 1.370920 1 1.170863
industry.model <- lm(sat.formula, data = industry_cleaned)
Correcting violation of Normality in previous model with BoxCox transform
bc_func <- function (lm.cleaned, lambda.range){
bc = boxCox(lm.cleaned, lambda = lambda.range)
#Extracting the best lambda value.
return(bc$x[which(bc$y == max(bc$y))])
}
#Income Group Dataset
income.lambda.bc = bc_func(income.model, seq(-0.2, 0.2, 1/10))
income.lambda.bc
[1] -0.01414141
#Industry Group Dataset
industry.lambda.bc = bc_func(industry.model, seq(-0.2, 0.2, 1/10))
industry.lambda.bc
[1] -0.03434343
bc_transform <- function(df, lambda.bc){
return (df %>%
mutate(Avg.bc = (Avg^lambda.bc -1)/lambda.bc) %>%
select(-c(Avg, Amount)))
}
#Income Group Dataset
income_cleaned_bc <- bc_transform(income_cleaned, income.lambda.bc)
income.model.bc = lm(Avg.bc ~ ., data = income_cleaned_bc)
#Industry Group Dataset
industry_cleaned_bc <- bc_transform(industry_cleaned, industry.lambda.bc)
industry.model.bc = lm(Avg.bc ~ ., data = industry_cleaned_bc)
Checking linear regression assumptions for the transformed data.
sat.formula.bc <- Avg.bc ~ .
sat.field.bc <- 'Avg.bc'
#Income
sat.model.summary(income_cleaned_bc, sat.field.bc, sat.formula.bc)
Shapiro-Wilk normality test
data: df[[field]]
W = 0.99696, p-value = 0.000782
[1] "kurtosis"
[1] -0.2553313
Call:
lm(formula = sat.formula, data = df)
Residuals:
Min 1Q Median 3Q Max
-6.6597 -0.5738 -0.0113 0.5668 4.4826
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.634e+01 1.456e+01 -2.496 0.012643
Year 2.288e-02 7.214e-03 3.171 0.001543
NameAlternative Fuels and Electric Vehicle Recharging Property Credit -8.643e-01 3.281e-01 -2.634 0.008510
NameAlternative Minimum Tax Credit -2.052e+00 2.577e-01 -7.962 2.91e-15
NameBeer Production Credit 5.687e-01 3.407e-01 1.669 0.095234
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 6/23/08 but before 7/1/15 1.715e+00 2.630e-01 6.521 8.99e-11
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 7/1/15 2.676e+00 3.754e-01 7.130 1.43e-12
NameBrownfield Tax Credits - Redevelopment Tax Credit - Prior to 6/23/08 1.331e+00 2.622e-01 5.076 4.24e-07
NameBrownfield Tax Credits - Remediation Real Property Tax Credit 5.458e-02 2.620e-01 0.208 0.834991
NameClean Heating Fuel Credit -3.086e+00 2.705e-01 -11.409 < 2e-16
NameConservation Easement Tax Credit -2.137e+00 2.810e-01 -7.606 4.45e-14
NameCredit for Employment of Persons with Disabilities -3.118e+00 2.956e-01 -10.547 < 2e-16
NameCredit for Purchase of an Automated External Defibrillator -2.938e+00 2.560e-01 -11.473 < 2e-16
NameCredit for Taxicabs & Livery Service Vehicles Accessible to Persons with Disabilities -5.673e-01 4.698e-01 -1.207 0.227430
NameEmpire State Apprentice Tax Credit -2.207e+00 4.024e-01 -5.485 4.71e-08
NameEmpire State Commercial Production Credit 6.075e-02 3.410e-01 0.178 0.858634
NameEmpire State Film Post Production Credit 1.101e+00 2.769e-01 3.977 7.25e-05
NameEmpire State Film Production Credit 2.853e+00 2.632e-01 10.838 < 2e-16
NameEmpire State Musical and Theatrical Production Credit 2.508e-01 4.688e-01 0.535 0.592803
NameExcelsior Jobs Program Credit 4.651e-01 2.521e-01 1.845 0.065178
NameEZ/QEZE Tax Credits - EZ Investment Tax Credit 8.876e-01 2.341e-01 3.792 0.000154
NameEZ/QEZE Tax Credits - EZ Wage Tax Credit 1.775e-01 2.433e-01 0.730 0.465701
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes 1.220e+00 2.440e-01 5.000 6.26e-07
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes For Corporate Partners 1.182e-01 2.522e-01 0.469 0.639260
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit -8.497e-01 2.479e-01 -3.427 0.000623
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit For Corporate Partners -1.358e+00 2.756e-01 -4.928 9.02e-07
NameFarm Workforce Retention Credit -1.612e+00 3.394e-01 -4.749 2.20e-06
NameFarmers' School Tax Credit -1.311e+00 2.696e-01 -4.863 1.26e-06
NameHire a Veteran Credit -2.915e+00 4.706e-01 -6.195 7.15e-10
NameHistoric Properties Rehabilitation Credit 1.918e+00 2.786e-01 6.886 7.81e-12
NameIndustrial or Manufacturing Business Tax Credit -1.718e+00 2.836e-01 -6.059 1.66e-09
NameInvestment Tax Credit 7.147e-02 2.349e-01 0.304 0.760985
NameInvestment Tax Credit for the Financial Services Industry 3.176e-01 2.662e-01 1.193 0.232870
NameLife Sciences Research & Development Tax Credit 7.033e-01 5.543e-01 1.269 0.204728
NameLong-Term Care Insurance Credit -2.863e+00 2.590e-01 -11.051 < 2e-16
NameLow-Income Housing Credit -9.063e-01 2.940e-01 -3.083 0.002080
NameMinimum Wage Reimbursement Credit -1.241e+00 2.656e-01 -4.674 3.17e-06
NameMortgage Servicing Tax Credit -9.378e-01 2.865e-01 -3.273 0.001084
NameNew York Youth Jobs Program Tax Credit -1.330e+00 2.478e-01 -5.367 9.01e-08
NameQETC Capital Tax Credit 4.561e-01 3.661e-01 1.246 0.212956
NameQETC Employment Credit -5.888e-01 2.709e-01 -2.174 0.029855
NameQETC Facilities, Operations, and Training Credit 5.799e-01 5.065e-01 1.145 0.252462
NameReal Property Tax Relief Credit for Manufacturing -1.518e+00 2.550e-01 -5.956 3.09e-09
NameSpecial Additional Mortgage Recording Tax Credit -2.309e-01 2.441e-01 -0.946 0.344374
NameSTART-UP NY Tax Elimination Credit -2.193e+00 2.985e-01 -7.345 3.05e-13
Group1,000,000 - 24,999,999 1.090e+00 9.246e-02 11.785 < 2e-16
Group100,000 - 499,999 3.590e-01 9.451e-02 3.798 0.000150
Group100,000,000 - 499,999,999 1.685e+00 1.040e-01 16.202 < 2e-16
Group25,000,000 - 49,999,999 1.325e+00 1.115e-01 11.883 < 2e-16
Group50,000,000 - 99,999,999 1.473e+00 1.118e-01 13.172 < 2e-16
Group500,000 - 999,999 6.032e-01 1.025e-01 5.886 4.67e-09
Group500,000,000 - and over 2.332e+00 1.016e-01 22.953 < 2e-16
GroupZero or Net Loss 9.934e-01 8.817e-02 11.267 < 2e-16
Num -1.533e-03 2.263e-04 -6.775 1.66e-11
(Intercept) *
Year **
NameAlternative Fuels and Electric Vehicle Recharging Property Credit **
NameAlternative Minimum Tax Credit ***
NameBeer Production Credit .
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 6/23/08 but before 7/1/15 ***
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 7/1/15 ***
NameBrownfield Tax Credits - Redevelopment Tax Credit - Prior to 6/23/08 ***
NameBrownfield Tax Credits - Remediation Real Property Tax Credit
NameClean Heating Fuel Credit ***
NameConservation Easement Tax Credit ***
NameCredit for Employment of Persons with Disabilities ***
NameCredit for Purchase of an Automated External Defibrillator ***
NameCredit for Taxicabs & Livery Service Vehicles Accessible to Persons with Disabilities
NameEmpire State Apprentice Tax Credit ***
NameEmpire State Commercial Production Credit
NameEmpire State Film Post Production Credit ***
NameEmpire State Film Production Credit ***
NameEmpire State Musical and Theatrical Production Credit
NameExcelsior Jobs Program Credit .
NameEZ/QEZE Tax Credits - EZ Investment Tax Credit ***
NameEZ/QEZE Tax Credits - EZ Wage Tax Credit
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes ***
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes For Corporate Partners
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit ***
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit For Corporate Partners ***
NameFarm Workforce Retention Credit ***
NameFarmers' School Tax Credit ***
NameHire a Veteran Credit ***
NameHistoric Properties Rehabilitation Credit ***
NameIndustrial or Manufacturing Business Tax Credit ***
NameInvestment Tax Credit
NameInvestment Tax Credit for the Financial Services Industry
NameLife Sciences Research & Development Tax Credit
NameLong-Term Care Insurance Credit ***
NameLow-Income Housing Credit **
NameMinimum Wage Reimbursement Credit ***
NameMortgage Servicing Tax Credit **
NameNew York Youth Jobs Program Tax Credit ***
NameQETC Capital Tax Credit
NameQETC Employment Credit *
NameQETC Facilities, Operations, and Training Credit
NameReal Property Tax Relief Credit for Manufacturing ***
NameSpecial Additional Mortgage Recording Tax Credit
NameSTART-UP NY Tax Elimination Credit ***
Group1,000,000 - 24,999,999 ***
Group100,000 - 499,999 ***
Group100,000,000 - 499,999,999 ***
Group25,000,000 - 49,999,999 ***
Group50,000,000 - 99,999,999 ***
Group500,000 - 999,999 ***
Group500,000,000 - and over ***
GroupZero or Net Loss ***
Num ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.022 on 1867 degrees of freedom
Multiple R-squared: 0.7368, Adjusted R-squared: 0.7293
F-statistic: 98.61 on 53 and 1867 DF, p-value: < 2.2e-16
GVIF Df GVIF^(1/(2*Df))
Year 2.125869 1 1.458036
Name 3.521453 43 1.014746
Group 1.510763 8 1.026124
Num 1.645484 1 1.282764
#Industry
sat.model.summary(industry_cleaned_bc, sat.field.bc, sat.formula.bc)
Shapiro-Wilk normality test
data: df[[field]]
W = 0.9902, p-value = 5.826e-12
[1] "kurtosis"
[1] -0.4869026
Call:
lm(formula = sat.formula, data = df)
Residuals:
Min 1Q Median 3Q Max
-6.2888 -0.4697 0.0183 0.4928 3.9068
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -6.095e+01 1.107e+01 -5.505 4.09e-08
Year 3.440e-02 5.488e-03 6.269 4.30e-10
NameAlternative Fuels and Electric Vehicle Recharging Property Credit 2.665e-01 2.733e-01 0.975 0.329737
NameAlternative Minimum Tax Credit -2.463e+00 2.272e-01 -10.839 < 2e-16
NameBeer Production Credit 6.334e-01 3.381e-01 1.873 0.061126
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 6/23/08 but before 7/1/15 2.198e+00 2.415e-01 9.100 < 2e-16
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 7/1/15 2.727e+00 3.930e-01 6.939 5.06e-12
NameBrownfield Tax Credits - Redevelopment Tax Credit - Prior to 6/23/08 1.586e+00 2.581e-01 6.148 9.17e-10
NameBrownfield Tax Credits - Remediation Real Property Tax Credit 9.695e-01 2.570e-01 3.772 0.000166
NameClean Heating Fuel Credit -2.544e+00 2.481e-01 -10.252 < 2e-16
NameConservation Easement Tax Credit -1.123e+00 2.538e-01 -4.423 1.02e-05
NameCredit for Employment of Persons with Disabilities -1.245e+00 2.737e-01 -4.549 5.67e-06
NameCredit for Purchase of an Automated External Defibrillator -1.535e+00 2.411e-01 -6.368 2.29e-10
NameCredit for Taxicabs & Livery Service Vehicles Accessible to Persons with Disabilities -8.171e-02 4.474e-01 -0.183 0.855119
NameEmpire State Apprentice Tax Credit -8.683e-01 5.464e-01 -1.589 0.112125
NameEmpire State Commercial Production Credit 6.235e-01 3.333e-01 1.871 0.061481
NameEmpire State Film Post Production Credit 1.439e+00 2.821e-01 5.100 3.66e-07
NameEmpire State Film Production Credit 3.232e+00 2.584e-01 12.509 < 2e-16
NameEmpire State Musical and Theatrical Production Credit 1.012e+00 4.865e-01 2.080 0.037608
NameExcelsior Jobs Program Credit 1.643e+00 2.372e-01 6.929 5.42e-12
NameEZ/QEZE Tax Credits - EZ Investment Tax Credit 1.310e+00 2.332e-01 5.617 2.17e-08
NameEZ/QEZE Tax Credits - EZ Wage Tax Credit 7.127e-01 2.275e-01 3.133 0.001750
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes 1.707e+00 2.264e-01 7.540 6.61e-14
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes For Corporate Partners 9.777e-01 2.328e-01 4.200 2.77e-05
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit 1.609e-01 2.303e-01 0.699 0.484921
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit For Corporate Partners -5.241e-01 2.722e-01 -1.925 0.054320
NameFarm Workforce Retention Credit -8.623e-01 2.890e-01 -2.984 0.002876
NameFarmers' School Tax Credit -1.158e+00 2.771e-01 -4.179 3.03e-05
NameHire a Veteran Credit -1.040e+00 4.456e-01 -2.335 0.019638
NameHistoric Properties Rehabilitation Credit 2.397e+00 2.614e-01 9.172 < 2e-16
NameInvestment Tax Credit 6.981e-01 2.230e-01 3.131 0.001763
NameInvestment Tax Credit for the Financial Services Industry 1.560e+00 3.108e-01 5.019 5.59e-07
NameLife Sciences Research & Development Tax Credit 9.057e-01 4.864e-01 1.862 0.062697
NameLong-Term Care Insurance Credit -1.867e+00 2.268e-01 -8.231 3.01e-16
NameLow-Income Housing Credit 1.331e+00 2.967e-01 4.487 7.57e-06
NameMinimum Wage Reimbursement Credit -9.161e-01 2.366e-01 -3.872 0.000111
NameMortgage Servicing Tax Credit 8.932e-01 3.489e-01 2.560 0.010534
NameNew York Youth Jobs Program Tax Credit 1.263e-01 2.307e-01 0.548 0.584069
NameQETC Capital Tax Credit 1.162e+00 3.168e-01 3.667 0.000251
NameQETC Employment Credit -3.048e-01 2.411e-01 -1.264 0.206249
NameQETC Facilities, Operations, and Training Credit 1.147e+00 3.624e-01 3.165 0.001571
NameReal Property Tax Relief Credit for Manufacturing -6.559e-01 2.388e-01 -2.746 0.006077
NameSpecial Additional Mortgage Recording Tax Credit 7.505e-01 2.393e-01 3.136 0.001736
NameSTART-UP NY Tax Elimination Credit -1.834e+00 2.565e-01 -7.151 1.14e-12
GroupAdministrative and Support and Waste Management and Remediation Services 2.878e-01 1.360e-01 2.116 0.034417
GroupAdministrative/Support/Waste Management/Remediation Services 1.675e-01 1.511e-01 1.109 0.267612
GroupAgriculture, Forestry, Fishing and Hunting -1.132e-01 1.258e-01 -0.899 0.368486
GroupArts, Entertainment, and Recreation 5.595e-01 1.251e-01 4.473 8.09e-06
GroupConstruction -4.299e-02 1.172e-01 -0.367 0.713860
GroupEducational Services 2.253e-01 1.592e-01 1.415 0.157160
GroupFinance and Insurance 5.350e-01 1.099e-01 4.870 1.19e-06
GroupHealth Care and Social Assistance -6.777e-02 1.237e-01 -0.548 0.583720
GroupInformation 7.098e-01 1.176e-01 6.037 1.81e-09
GroupManagement of Companies and Enterprises 6.672e-01 1.052e-01 6.340 2.73e-10
GroupManufacturing 4.608e-01 1.095e-01 4.208 2.67e-05
GroupMining 2.515e-01 1.940e-01 1.296 0.194977
GroupMining, Quarrying, and Oil and Gas Extraction 3.869e-01 1.626e-01 2.379 0.017421
GroupOther Services (except Public Administration) -1.550e-01 1.197e-01 -1.295 0.195482
GroupProfessional, Scientific, and Technical Services 4.875e-01 1.126e-01 4.331 1.55e-05
GroupReal Estate and Rental and Leasing 1.461e-01 1.104e-01 1.324 0.185596
GroupRetail Trade 3.754e-01 1.100e-01 3.414 0.000650
GroupTransportation and Warehousing 2.125e-01 1.242e-01 1.711 0.087179
GroupUtilities 6.972e-01 1.422e-01 4.904 1.00e-06
GroupWholesale Trade 4.367e-01 1.124e-01 3.886 0.000105
Num 4.542e-04 2.307e-04 1.969 0.049082
(Intercept) ***
Year ***
NameAlternative Fuels and Electric Vehicle Recharging Property Credit
NameAlternative Minimum Tax Credit ***
NameBeer Production Credit .
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 6/23/08 but before 7/1/15 ***
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 7/1/15 ***
NameBrownfield Tax Credits - Redevelopment Tax Credit - Prior to 6/23/08 ***
NameBrownfield Tax Credits - Remediation Real Property Tax Credit ***
NameClean Heating Fuel Credit ***
NameConservation Easement Tax Credit ***
NameCredit for Employment of Persons with Disabilities ***
NameCredit for Purchase of an Automated External Defibrillator ***
NameCredit for Taxicabs & Livery Service Vehicles Accessible to Persons with Disabilities
NameEmpire State Apprentice Tax Credit
NameEmpire State Commercial Production Credit .
NameEmpire State Film Post Production Credit ***
NameEmpire State Film Production Credit ***
NameEmpire State Musical and Theatrical Production Credit *
NameExcelsior Jobs Program Credit ***
NameEZ/QEZE Tax Credits - EZ Investment Tax Credit ***
NameEZ/QEZE Tax Credits - EZ Wage Tax Credit **
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes ***
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes For Corporate Partners ***
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit For Corporate Partners .
NameFarm Workforce Retention Credit **
NameFarmers' School Tax Credit ***
NameHire a Veteran Credit *
NameHistoric Properties Rehabilitation Credit ***
NameInvestment Tax Credit **
NameInvestment Tax Credit for the Financial Services Industry ***
NameLife Sciences Research & Development Tax Credit .
NameLong-Term Care Insurance Credit ***
NameLow-Income Housing Credit ***
NameMinimum Wage Reimbursement Credit ***
NameMortgage Servicing Tax Credit *
NameNew York Youth Jobs Program Tax Credit
NameQETC Capital Tax Credit ***
NameQETC Employment Credit
NameQETC Facilities, Operations, and Training Credit **
NameReal Property Tax Relief Credit for Manufacturing **
NameSpecial Additional Mortgage Recording Tax Credit **
NameSTART-UP NY Tax Elimination Credit ***
GroupAdministrative and Support and Waste Management and Remediation Services *
GroupAdministrative/Support/Waste Management/Remediation Services
GroupAgriculture, Forestry, Fishing and Hunting
GroupArts, Entertainment, and Recreation ***
GroupConstruction
GroupEducational Services
GroupFinance and Insurance ***
GroupHealth Care and Social Assistance
GroupInformation ***
GroupManagement of Companies and Enterprises ***
GroupManufacturing ***
GroupMining
GroupMining, Quarrying, and Oil and Gas Extraction *
GroupOther Services (except Public Administration)
GroupProfessional, Scientific, and Technical Services ***
GroupReal Estate and Rental and Leasing
GroupRetail Trade ***
GroupTransportation and Warehousing .
GroupUtilities ***
GroupWholesale Trade ***
Num *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.8703 on 2411 degrees of freedom
Multiple R-squared: 0.7631, Adjusted R-squared: 0.7569
F-statistic: 121.4 on 64 and 2411 DF, p-value: < 2.2e-16
GVIF Df GVIF^(1/(2*Df))
Year 2.190806 1 1.480137
Name 5.185764 42 1.019787
Group 2.724217 20 1.025371
Num 1.370920 1 1.170863
BIC comparison before and after BoxCox transform
BIC(income.model.bc, income.model)
BIC(industry.model.bc, industry.model)
Stepwise Regression on Income_cat_bc (boxcox transformed dataset)
dim(industry.dummy.bc)
[1] 2476 65
Cleaning column names further so stepwise regression doesn’t present any errors
#Income Group Dataset
income.dummy.bc <- dummy_func(income_cleaned_bc)
#colnames(income.dummy.bc)[37] <- 'NameManufactureru0092s_Real_Property_Tax_Credit'
colnames(income.dummy.bc)
#Industry Group Dataset
industry.dummy.bc <- dummy_func(industry_cleaned_bc)
#colnames(industry.dummy.bc)[35] <- 'NameManufactureru0092s_Real_Property_Tax_Credit'
colnames(industry.dummy.bc)
Stepwise regression using BIC as the criteria (k = log(n)).
#Creating Stepwise Models
bcs = list(income = income.dummy.bc, industry = industry.dummy.bc)
model.fulls = list(income = lm(Avg.bc ~ ., data = income.dummy.bc), industry = lm(Avg.bc ~ ., data = industry.dummy.bc)) #All variables
model.emptys = list(income = lm(Avg.bc ~ 1, data = income.dummy.bc), industry = lm(Avg.bc ~ 1, data = industry.dummy.bc)) #intercept only
k = c('income', 'industry')
forwardBIC = list(income = NULL, industry = NULL)
backwardBIC = list(income = NULL, industry = NULL)
for (i in k){
bc = bcs[[i]]
scope = list(lower = formula(model.emptys[[i]]), upper = formula(model.fulls[[i]]))
n_obs = bc %>% count() %>% dplyr::first()
forwardBIC[[i]] = step(model.emptys[[i]], scope, direction = "forward", k = log(n_obs))
backwardBIC[[i]] = step(model.fulls[[i]], scope, direction = "backward", k = log(n_obs))
}
Selecting Best Formula per Dataset from Stepwise Regressions
bic_func(forwardBIC[['income']])
[1] "Adjusted R Squared:"
[1] 0.7287541
[1] "Number of Coefficients:"
[1] 41
[1] "VIF Check: "
[1] 1.867767
[1] "*************************"
bic_func(backwardBIC[['income']])
[1] "Adjusted R Squared:"
[1] 0.7287541
[1] "Number of Coefficients:"
[1] 41
[1] "VIF Check: "
[1] 1.867767
[1] "*************************"
bic_func(forwardBIC[['industry']])
[1] "Adjusted R Squared:"
[1] 0.7532731
[1] "Number of Coefficients:"
[1] 42
[1] "VIF Check: "
[1] 1.594046
[1] "*************************"
bic_func(backwardBIC[['industry']])
[1] "Adjusted R Squared:"
[1] 0.7527448
[1] "Number of Coefficients:"
[1] 43
[1] "VIF Check: "
[1] 2.244589
[1] "*************************"
Best Model Selection from Stepwise
#The Best Models selected for both income and industry were forwardBIC.
#Income Group Dataset
income.best.formula <- forwardBIC[['income']]$call[[2]]
income.best.formula
#Industry Group Dataset
industry.best.formula <- forwardBIC[['industry']]$call[[2]]
industry.best.formula
Splitting data up into test data and training data (test data is for year 2019, training is the rest)
test_train_split <- function(dummy_bc, best.formula) {
# data.test <- dummy_bc %>% filter(Year == 2019)
# data.train <- dummy_bc %>% filter(Year != 2019)
set.seed(0)
train.i = sample(1:nrow(dummy_bc), 0.8*nrow(dummy_bc), replace = F)
data.train <- dummy_bc %>% slice(train.i)
data.test <- dummy_bc %>% slice(-train.i)
#train
#X.train <- as.matrix(data.train %>% select(-Avg.bc))
X.train <- model.matrix(best.formula, data = data.train)[,-1]
y.train <- as.matrix(data.train %>% select(Avg.bc))
#test
#X.test <- as.matrix(data.test %>% select(-Avg.bc))
X.test <- model.matrix(best.formula, data = data.test)[,-1]
y.test <- as.matrix(data.test %>% select(Avg.bc))
return (list('X.train' = X.train, 'y.train' = y.train, 'X.test' = X.test, 'y.test' = y.test, 'data.train' = data.train, 'data.test' = data.test))
}
#Income
income.data.split.sat <- test_train_split(income.dummy.bc, Avg.bc ~ .)
income.data.split <- test_train_split(income.dummy.bc, income.best.formula)
#Industry
industry.data.split.sat <- test_train_split(industry.dummy.bc, Avg.bc ~ .)
industry.data.split <- test_train_split(industry.dummy.bc, industry.best.formula)
Lasso regression for comparison to Forward Stepwise
# calc_MSE <- function (model, x.test, y.test){
# y.predict <- predict(model, newdata = x.test)
# return(mean((y.predict - y.test)^2))
# }
xy.splits <- list('income' = income.data.split,
'industry' = industry.data.split,
'income.sat' = income.data.split.sat,
'industry.sat' = industry.data.split.sat)
lasso.list <- c('income', 'industry', 'income.sat', 'industry.sat')
baseline.list <- c('income.sat', 'industry.sat', 'income.sat', 'industry.sat')
for (a in 1:4){
i = lasso.list[a]
X.train <- xy.splits[[i]][['X.train']]
y.train <- xy.splits[[i]][['y.train']]
X.test <- xy.splits[[i]][['X.test']]
y.test <- xy.splits[[i]][['y.test']]
#create lambda grid
lambda.grid = 10^seq(2, -5, length = 100)
#create lasso models with lambda.grid
lasso.models = glmnet(X.train, y.train, alpha = 1, lambda = lambda.grid)
#visualize coefficient shrinkage
plot(lasso.models, xvar = "lambda", label = TRUE, main = paste("Lasso Regression:", i))
#Cross Validation to find best lambda
set.seed(0)
cv.lasso.models <- cv.glmnet(X.train, y.train, alpha = 1, lambda = lambda.grid, nfolds = 10)
#visualize cross validation for lambda that minimizes the mean squared error.
plot(cv.lasso.models, main = paste("Lasso Regression:", i))
#Checking the best lambda
log(cv.lasso.models$lambda.min)
best.lambda <- cv.lasso.models$lambda.min
print(paste(i, ' best.lambda:', best.lambda))
# best lambda with all the variables was found to be 0.0006892612
# best lambda with only the bwdBIC coefficients included was found to be 0.0003053856
#looking at the lasso coefficients for the best.lambda
best.lambda.coeff <- predict(lasso.models, s = best.lambda, type = "coefficients")
print('Number of Coefficients:')
print(dim(best.lambda.coeff)[1])
#fitting a model with the best lambda found to be 0.000689 and using it to make predictions for the testing data.
lasso.best.lambda.train.pred <- predict(lasso.models, s = best.lambda, newx = X.test)
lasso.best.lambda.train.pred
#checking MSE
MSE.lasso <- mean((lasso.best.lambda.train.pred - y.test)^2)
sat.model.bc <- lm(Avg.bc ~., data = xy.splits[[i]][['data.train']])
b = baseline.list[a]
temp.df <- as.data.frame(xy.splits[[b]][['X.test']]) #temp fix
colnames(temp.df) <- str_replace_all(colnames(temp.df), "[`]", '')
y.predict <- predict(sat.model.bc, newdata = temp.df)
MSE.sat <- mean((y.predict - xy.splits[[b]][['y.test']])^2)
print(paste(i, ' Lasso MSE: ', MSE.lasso, ' Saturated MSE: ', MSE.sat))
print('********************************')
}
[1] "income best.lambda: 0.000585702081805667"
[1] "Number of Coefficients:"
[1] 41
[1] "income Lasso MSE: 1.07302969291217 Saturated MSE: 1.06487990945569"
[1] "********************************"
[1] "industry best.lambda: 0.00112332403297803"
[1] "Number of Coefficients:"
[1] 42
[1] "industry Lasso MSE: 0.8144055528551 Saturated MSE: 0.798045495313799"
[1] "********************************"
[1] "income.sat best.lambda: 0.00155567614393047"
[1] "Number of Coefficients:"
[1] 54
[1] "income.sat Lasso MSE: 1.06539192794953 Saturated MSE: 1.06487990945569"
[1] "********************************"
[1] "industry.sat best.lambda: 0.00215443469003189"
[1] "Number of Coefficients:"
[1] 65
[1] "industry.sat Lasso MSE: 0.799220921238181 Saturated MSE: 0.798045495313799"
[1] "********************************"
# sat.model.bc <- lm(Avg.bc ~., data = xy.splits.sat[['industry']][['data.train']])
#
# temp.df <- as.data.frame(xy.splits.sat[['industry']][['X.test']])
#
# colnames(temp.df)[57]
#
# colnames(temp.df) <- str_replace_all(colnames(temp.df), "[`]", '')
# colnames(temp.df)
#
# y.predict <- predict(sat.model.bc, newdata = temp.df)
# MSE.sat <- mean((y.predict - xy.splits.sat[['industry']][['y.test']])^2)
#
# print(paste(i, ' Lasso MSE: ', MSE.lasso, ' Saturated MSE: ', MSE.sat))
#
# summary(sat.model.bc)$coefficients
# as.data.frame(xy.splits.sat[['industry']][['X.test']]) %>% select(starts_with('`GroupOth'))
as.data.frame(lasso.best.lambda.train.pred) %>% mutate(Avg_in_dollars = (lasso.best.lambda.train.pred*lambda.bc+1)^(1/lambda.bc))
calc_MSE(sat.model.bc, as.data.frame(income.sat.data.split[['X.test']]), income.sat.data.split[['y.test']])
[1] 1.473686